from __future__ import print_function

import time

import numpy as np
import sklearn
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

import lime
import lime.lime_tabular
import shap
# NOTE(review): the seed was left commented out, so the split and the forest
# are not reproducible across runs -- uncomment to pin the results below.
# np.random.seed(1)

# Train a random forest on iris and report held-out accuracy.
X = sklearn.datasets.load_iris()
train, test, train_label, test_label = sklearn.model_selection.train_test_split(
    X.data, X.target, train_size=0.80)
model_rf = RandomForestClassifier(n_estimators=500)
model_rf.fit(train, train_label)
print(sklearn.metrics.accuracy_score(test_label, model_rf.predict(test)))
# Observed output: 0.8666666666666667
# Generate the LIME tabular explainer for the forest and time its construction.
# time.perf_counter() replaces time.clock(), which was removed in Python 3.8.
start = time.perf_counter()
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    train,
    feature_names=X.feature_names,
    class_names=X.target_names,
    discretize_continuous=True)
end = time.perf_counter()
# calculate the runtime
print('The function run time is: ', end - start, ' second(s)')
# Observed output: The function run time is: 0.007305700000003412 second(s)
# Pick two random test rows and explain the forest's prediction for each,
# timing every explain_instance call (time.clock was removed in Python 3.8).
i = np.random.randint(0, test.shape[0])
print(i)
j = np.random.randint(0, test.shape[0])

# explain the first case
start = time.perf_counter()
exp1_rf = lime_explainer.explain_instance(
    test[i], model_rf.predict_proba, num_features=2, top_labels=1)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')

# explain the second case
start = time.perf_counter()
exp2_rf = lime_explainer.explain_instance(
    test[j], model_rf.predict_proba, num_features=2, top_labels=1)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# Observed output: 4, then ~2.67 s and ~2.51 s runtimes.

# Render both explanations inline in the notebook.
exp1_rf.show_in_notebook(show_table=True, show_all=True)
exp2_rf.show_in_notebook(show_table=True, show_all=True)
# Train a logistic regression on a fresh iris split and report accuracy.
X = sklearn.datasets.load_iris()
train, test, train_label, test_label = sklearn.model_selection.train_test_split(
    X.data, X.target, train_size=0.80)
# max_iter raised from the default 100: lbfgs hits its iteration limit on the
# unscaled iris features (a ConvergenceWarning is recorded later in this file).
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(train, train_label)
print(sklearn.metrics.accuracy_score(test_label, model_lr.predict(test)))
# Observed output: 0.9666666666666667
# Generate the LIME tabular explainer for the logistic regression.
# time.perf_counter() replaces time.clock(), which was removed in Python 3.8.
start = time.perf_counter()
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    train,
    feature_names=X.feature_names,
    class_names=X.target_names,
    discretize_continuous=True)
end = time.perf_counter()
# calculate the runtime
print('The function run time is: ', end - start, ' second(s)')
# Observed output: The function run time is: 0.007947500000000218 second(s)
# Explain the same two test rows (i, j chosen in the RF section) with the
# logistic-regression model; time.clock was removed in Python 3.8.
# explain the first case
start = time.perf_counter()
exp1_lr = lime_explainer.explain_instance(
    test[i], model_lr.predict_proba, num_features=2, top_labels=1)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')

# explain the second case
start = time.perf_counter()
exp2_lr = lime_explainer.explain_instance(
    test[j], model_lr.predict_proba, num_features=2, top_labels=1)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# Observed output: 14, then ~2.33 s and ~2.18 s runtimes.

# Render both explanations inline in the notebook.
exp1_lr.show_in_notebook(show_table=True, show_all=True)
exp2_lr.show_in_notebook(show_table=True, show_all=True)
# Train an XGBoost classifier on a fresh iris split and report accuracy.
X = sklearn.datasets.load_iris()
train, test, train_label, test_label = sklearn.model_selection.train_test_split(
    X.data, X.target, train_size=0.80)
# eval_metric set explicitly to silence the multi-class default-metric warning.
model_xgb = XGBClassifier(eval_metric='mlogloss')
model_xgb.fit(train, train_label)
print(sklearn.metrics.accuracy_score(test_label, model_xgb.predict(test)))
# Observed output: 0.9333333333333333
# Generate the LIME tabular explainer for the XGBoost model.
# time.perf_counter() replaces time.clock(), which was removed in Python 3.8.
start = time.perf_counter()
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    train,
    feature_names=X.feature_names,
    class_names=X.target_names,
    discretize_continuous=True)
end = time.perf_counter()
# calculate the runtime
print('The function run time is: ', end - start, ' second(s)')
# Observed output: The function run time is: 0.007662999999979547 second(s)
# Explain the same two test rows (i, j chosen in the RF section) with the
# XGBoost model; time.clock was removed in Python 3.8.
# explain the first case
start = time.perf_counter()
exp1_xgb = lime_explainer.explain_instance(
    test[i], model_xgb.predict_proba, num_features=2, top_labels=1)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')

# explain the second case
start = time.perf_counter()
exp2_xgb = lime_explainer.explain_instance(
    test[j], model_xgb.predict_proba, num_features=2, top_labels=1)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# Observed output: 3, then ~2.84 s and ~2.25 s runtimes.

# Render both explanations inline in the notebook.
exp1_xgb.show_in_notebook(show_table=True, show_all=True)
exp2_xgb.show_in_notebook(show_table=True, show_all=True)
# Train a k-nearest-neighbors classifier on a fresh iris split and report accuracy.
X = sklearn.datasets.load_iris()
train, test, train_label, test_label = sklearn.model_selection.train_test_split(
    X.data, X.target, train_size=0.80)
model_knn = KNeighborsClassifier()
model_knn.fit(train, train_label)
print(sklearn.metrics.accuracy_score(test_label, model_knn.predict(test)))
# Observed output: 0.9
# Generate the LIME tabular explainer for the KNN model.
# time.perf_counter() replaces time.clock(), which was removed in Python 3.8.
start = time.perf_counter()
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    train,
    feature_names=X.feature_names,
    class_names=X.target_names,
    discretize_continuous=True)
end = time.perf_counter()
# calculate the runtime
print('The function run time is: ', end - start, ' second(s)')
# Observed output: The function run time is: 0.006422100000008868 second(s)
# Explain the same two test rows (i, j chosen in the RF section) with the
# KNN model; time.clock was removed in Python 3.8.
# explain the first case
start = time.perf_counter()
exp1_knn = lime_explainer.explain_instance(
    test[i], model_knn.predict_proba, num_features=2, top_labels=1)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')

# explain the second case
start = time.perf_counter()
exp2_knn = lime_explainer.explain_instance(
    test[j], model_knn.predict_proba, num_features=2, top_labels=1)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# Observed output: 24, then ~2.52 s and ~2.33 s runtimes.

# Render both explanations inline in the notebook.
exp1_knn.show_in_notebook(show_table=True, show_all=True)
exp2_knn.show_in_notebook(show_table=True, show_all=True)
# Retrain the random forest on the SHAP copy of iris (pandas DataFrame
# features) with a fixed split so the SHAP row indices below are reproducible.
train, test, train_label, test_label = sklearn.model_selection.train_test_split(
    *shap.datasets.iris(), test_size=0.2, random_state=0)
model_rf = RandomForestClassifier(n_estimators=500)
model_rf.fit(train, train_label)
# Observed output (repr of the fitted estimator):
# RandomForestClassifier(n_estimators=500)
# Explain a single forest prediction with a SHAP kernel explainer.
shap.initjs()

# Build the kernel explainer from the full training set as background data.
# SHAP warns that 120 background rows is slow; shap.kmeans(train, K) would
# summarize the background if speed mattered.
# time.perf_counter() replaces time.clock(), removed in Python 3.8.
start = time.perf_counter()
shap_explainer = shap.KernelExplainer(model_rf.predict_proba, train)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')

# calculate the SHAP values for a single test row (index 12)
start = time.perf_counter()
shap_values = shap_explainer.shap_values(test.iloc[12, :])
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# plot the class-0 force plot for that row
shap.force_plot(shap_explainer.expected_value[0], shap_values[0], test.iloc[12, :])
# Observed output: background-size warning, then ~0.078 s and ~0.213 s runtimes.
# SHAP values for a second test row (index 29); perf_counter replaces the
# removed time.clock.
start = time.perf_counter()
shap_values = shap_explainer.shap_values(test.iloc[29, :])
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# plot the class-0 force plot for that row
shap.force_plot(shap_explainer.expected_value[0], shap_values[0], test.iloc[29, :])
# Observed output: The function run time is: ~0.216 second(s)
# SHAP values for the whole test set (noticeably slower than single rows);
# perf_counter replaces the removed time.clock.
start = time.perf_counter()
shap_values = shap_explainer.shap_values(test)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# stacked force plot for class 0 across all test rows
shap.force_plot(shap_explainer.expected_value[0], shap_values[0], test)
# Observed output: The function run time is: ~6.31 second(s)
# The generic shap.Explainer over model_rf.predict builds much faster than
# the kernel explainer; perf_counter replaces the removed time.clock.
start = time.perf_counter()
shap_explainer = shap.Explainer(model_rf.predict, train)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# SHAP values for every test row, then summary plots
shap_values = shap_explainer(test)
shap.plots.waterfall(shap_values[0], max_display=4)
# Observed output: The function run time is: ~0.0015 second(s)
shap.plots.beeswarm(shap_values)
shap.plots.bar(shap_values)
# Retrain the logistic regression on the SHAP iris split (fixed random_state
# so the SHAP row indices below are reproducible).
train, test, train_label, test_label = sklearn.model_selection.train_test_split(
    *shap.datasets.iris(), test_size=0.2, random_state=0)
# max_iter raised from the default 100: the recorded run shows lbfgs hitting
# its iteration limit (ConvergenceWarning) on the unscaled iris features.
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(train, train_label)
# Observed output (before the max_iter fix): lbfgs ConvergenceWarning
# ("STOP: TOTAL NO. of ITERATIONS REACHED LIMIT"), then the fitted
# LogisticRegression() repr.
# SHAP kernel explainer for the logistic regression; perf_counter replaces
# the removed time.clock. The full training set is used as background data.
start = time.perf_counter()
shap_explainer = shap.KernelExplainer(model_lr.predict_proba, train)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')

# calculate the SHAP values for a single test row (index 12)
start = time.perf_counter()
shap_values = shap_explainer.shap_values(test.iloc[12, :])
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# plot the class-0 force plot for that row
shap.force_plot(shap_explainer.expected_value[0], shap_values[0], test.iloc[12, :])
# Observed output: background-size warning, then ~0.0033 s and ~0.040 s runtimes.
# SHAP values for a second test row (index 29); perf_counter replaces the
# removed time.clock.
start = time.perf_counter()
shap_values = shap_explainer.shap_values(test.iloc[29, :])
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# plot the class-0 force plot for that row
shap.force_plot(shap_explainer.expected_value[0], shap_values[0], test.iloc[29, :])
# Observed output: The function run time is: ~0.041 second(s)
# SHAP values for the whole test set; perf_counter replaces the removed
# time.clock.
start = time.perf_counter()
shap_values = shap_explainer.shap_values(test)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# stacked force plot for class 0 across all test rows
shap.force_plot(shap_explainer.expected_value[0], shap_values[0], test)
# Observed output: The function run time is: ~1.33 second(s)
# Generic shap.Explainer over model_lr.predict; perf_counter replaces the
# removed time.clock.
start = time.perf_counter()
shap_explainer = shap.Explainer(model_lr.predict, train)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# SHAP values for every test row, then summary plots
shap_values = shap_explainer(test)
shap.plots.waterfall(shap_values[12], max_display=4)
# Observed output: The function run time is: ~0.0020 second(s)
shap.plots.beeswarm(shap_values)
shap.plots.bar(shap_values)
# Retrain XGBoost on the SHAP iris split (fixed random_state so the SHAP row
# indices below are reproducible).
train, test, train_label, test_label = sklearn.model_selection.train_test_split(
    *shap.datasets.iris(), test_size=0.2, random_state=0)
model_xgb = XGBClassifier(eval_metric='mlogloss')
model_xgb.fit(train, train_label)
# Observed output: a deprecation warning about XGBClassifier's built-in label
# encoder, followed by the fitted XGBClassifier repr (default hyperparameters:
# n_estimators=100, max_depth=6, learning_rate=0.3, objective='multi:softprob').
# SHAP kernel explainer for the XGBoost model; perf_counter replaces the
# removed time.clock. The full training set is used as background data.
start = time.perf_counter()
shap_explainer = shap.KernelExplainer(model_xgb.predict_proba, train)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')

# calculate the SHAP values for a single test row (index 12)
start = time.perf_counter()
shap_values = shap_explainer.shap_values(test.iloc[12, :])
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# plot the class-0 force plot for that row
shap.force_plot(shap_explainer.expected_value[0], shap_values[0], test.iloc[12, :])
# Observed output: background-size warning, then ~0.394 s and ~0.085 s runtimes.
# SHAP values for a second test row (index 29); perf_counter replaces the
# removed time.clock.
start = time.perf_counter()
shap_values = shap_explainer.shap_values(test.iloc[29, :])
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# plot the class-0 force plot for that row
shap.force_plot(shap_explainer.expected_value[0], shap_values[0], test.iloc[29, :])
# Observed output: The function run time is: ~0.065 second(s)
# SHAP values for the whole test set; perf_counter replaces the removed
# time.clock.
start = time.perf_counter()
shap_values = shap_explainer.shap_values(test)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# stacked force plot for class 0 across all test rows
shap.force_plot(shap_explainer.expected_value[0], shap_values[0], test)
# Observed output: The function run time is: ~2.78 second(s)
# Generic shap.Explainer over model_xgb.predict; perf_counter replaces the
# removed time.clock.
start = time.perf_counter()
shap_explainer = shap.Explainer(model_xgb.predict, train)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# SHAP values for every test row, then summary plots
shap_values = shap_explainer(test)
shap.plots.waterfall(shap_values[12], max_display=4)
# Observed output: The function run time is: ~0.0011 second(s)
shap.plots.beeswarm(shap_values)
shap.plots.bar(shap_values)
# Retrain the KNN classifier on the SHAP iris split (fixed random_state so
# the SHAP row indices below are reproducible).
train, test, train_label, test_label = sklearn.model_selection.train_test_split(
    *shap.datasets.iris(), test_size=0.2, random_state=0)
model_knn = KNeighborsClassifier()
model_knn.fit(train, train_label)
# Observed output (repr of the fitted estimator): KNeighborsClassifier()
# SHAP kernel explainer for the KNN model; perf_counter replaces the removed
# time.clock. The full training set is used as background data.
start = time.perf_counter()
shap_explainer = shap.KernelExplainer(model_knn.predict_proba, train)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')

# calculate the SHAP values for a single test row (index 12)
start = time.perf_counter()
shap_values = shap_explainer.shap_values(test.iloc[12, :])
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# plot the class-0 force plot for that row
shap.force_plot(shap_explainer.expected_value[0], shap_values[0], test.iloc[12, :])
# Observed output: background-size warning, then ~0.0092 s and ~0.052 s runtimes.
# SHAP values for a second test row (index 29); perf_counter replaces the
# removed time.clock.
start = time.perf_counter()
shap_values = shap_explainer.shap_values(test.iloc[29, :])
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# plot the class-0 force plot for that row
shap.force_plot(shap_explainer.expected_value[0], shap_values[0], test.iloc[29, :])
# Observed output: The function run time is: ~0.054 second(s)
# SHAP values for the whole test set; perf_counter replaces the removed
# time.clock.
start = time.perf_counter()
shap_values = shap_explainer.shap_values(test)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# stacked force plot for class 0 across all test rows
shap.force_plot(shap_explainer.expected_value[0], shap_values[0], test)
# Observed output: The function run time is: ~1.35 second(s)
# Generic shap.Explainer over model_knn.predict; perf_counter replaces the
# removed time.clock.
start = time.perf_counter()
shap_explainer = shap.Explainer(model_knn.predict, train)
end = time.perf_counter()
print('The function run time is: ', end - start, ' second(s)')
# SHAP values for every test row, then summary plots
shap_values = shap_explainer(test)
shap.plots.waterfall(shap_values[12], max_display=4)
# Observed output: The function run time is: ~0.0024 second(s)
shap.plots.beeswarm(shap_values)
shap.plots.bar(shap_values)